###########################################
### Project: Where do parties interact? ###
### Task:    STM - Validation           ###
### Title:   STM_Validation.R           ###
###########################################

#--------------------------------------------------------------------------------------------------
# Description:
#
# This script performs validation checks of the Structural Topic Model (STM) with the R package 
# 'oolong' (source: https://github.com/chainsawriot/oolong/blob/master/overview_gh.md).
#--------------------------------------------------------------------------------------------------

#---------------------------------------------------------------------------------------------------------------------

# Load packages

library(stm)
library(tibble)
library(dplyr)
library(quanteda)
library(oolong)

#---------------------------------------------------------------------------------------------------------------------

# Load preprocessed corpus 
# Unfortunately, the pre-processed data cannot be published in the replication material as it includes full texts

# load("STM_Preprocessing_Data.RDA")

#---------------------------------------------------------------------------------------------------------------------

# Restore the fitted STM objects from disk into the current environment.
# NOTE(review): this file is expected to provide `stm_85`, which is used for the
# final model choice at the end of the script - confirm against the saved RDA.

load("STM_Model.RDA")

#---------------------------------------------------------------------------------------------------------------------

# Word and topic intrusion tests

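# Uncomment to perform the validation steps.
# Note: the calls below use `meta$text`, which is not defined by any active line in this script; it presumably comes
# from the preprocessed corpus (see the commented-out load() above), which must be loaded first.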

## STM (k = 80)
# set.seed(1000)
# witi_test_80 <- witi(stm_80,
#                      meta$text,
#                      n_top_topics = 3,
#                      exact_n = 50,
#                      use_frex_words = FALSE,
#                      userid = "Christoph")
# 
# save(witi_test_80, file = "witi_test_80_uncoded.RDA")
# 
# witi_test_80$do_word_intrusion_test()
# witi_test_80$do_topic_intrusion_test()
# 
# save(witi_test_80, file = "witi_test_80_in-progress.RDA")
# 
# witi_test_80$lock()
# 
# summarize_oolong(witi_test_80)
# 
# save(witi_test_80, file = "witi_test_80_coded.RDA")
# 
# ## STM (k = 85)
# set.seed(1000)
# witi_test_85 <- witi(stm_85,
#                      meta$text,
#                      n_top_topics = 3,
#                      exact_n = 50,
#                      use_frex_words = FALSE,
#                      userid = "Christoph")
# 
# save(witi_test_85, file = "witi_test_85_uncoded.RDA")
# 
# witi_test_85$do_word_intrusion_test()
# witi_test_85$do_topic_intrusion_test()
# 
# save(witi_test_85, file = "witi_test_85_in-progress.RDA")
# 
# witi_test_85$lock()
# 
# summarize_oolong(witi_test_85)
# 
# save(witi_test_85, file = "witi_test_85_coded.RDA")
# 
# ## STM (k = 90)
# set.seed(1000)
# witi_test_90 <- witi(stm_90,
#                      meta$text,
#                      n_top_topics = 3,
#                      exact_n = 50,
#                      use_frex_words = FALSE,
#                      userid = "Christoph")
# 
# save(witi_test_90, file = "witi_test_90_uncoded.RDA")
# 
# witi_test_90$do_word_intrusion_test()
# witi_test_90$do_topic_intrusion_test()
# 
# save(witi_test_90, file = "witi_test_90_in-progress.RDA")
# 
# witi_test_90$lock()
# 
# summarize_oolong(witi_test_90)
# 
# save(witi_test_90, file = "witi_test_90_coded.RDA")
# 
# ## STM (k = 95)
# set.seed(1000)
# witi_test_95 <- witi(stm_95,
#                      meta$text,
#                      n_top_topics = 3,
#                      exact_n = 50,
#                      use_frex_words = FALSE,
#                      userid = "Christoph")
# 
# save(witi_test_95, file = "witi_test_95_uncoded.RDA")
# 
# witi_test_95$do_word_intrusion_test()
# witi_test_95$do_topic_intrusion_test()
# 
# save(witi_test_95, file = "witi_test_95_in-progress.RDA")
# 
# witi_test_95$lock()
# 
# summarize_oolong(witi_test_95)
# 
# save(witi_test_95, file = "witi_test_95_coded.RDA")
# 
# ## STM (k = 100)
# set.seed(1000)
# witi_test_100 <- witi(stm_100,
#                       meta$text,
#                       n_top_topics = 3,
#                       exact_n = 50,
#                       use_frex_words = FALSE,
#                       userid = "Christoph")
# 
# save(witi_test_100, file = "witi_test_100_uncoded.RDA")
# 
# witi_test_100$do_word_intrusion_test()
# witi_test_100$do_topic_intrusion_test()
# 
# save(witi_test_100, file = "witi_test_100_in-progress.RDA")
# 
# witi_test_100$lock()
# 
# summarize_oolong(witi_test_100)
# 
# save(witi_test_100, file = "witi_test_100_coded.RDA")

#---------------------------------------------------------------------------------------------------------------------

# Validation statistics

## re-load coded word and topic intrusion tests

# Restore the coded word/topic intrusion tests for every candidate k.
# Each file is loaded into the current environment, exactly as a direct
# top-level load() call would do (this makes witi_test_80, ..., witi_test_100
# available below).
invisible(lapply(
  sprintf("witi_test_%d_coded.RDA", c(80, 85, 90, 95, 100)),
  load,
  envir = environment()
))

## One oolong summary per candidate model (names are used by the model choice below)
summary_80  <- summarize_oolong(witi_test_80)
summary_85  <- summarize_oolong(witi_test_85)
summary_90  <- summarize_oolong(witi_test_90)
summary_95  <- summarize_oolong(witi_test_95)
summary_100 <- summarize_oolong(witi_test_100)

#---------------------------------------------------------------------------------------------------------------------

# Model choice

# Comparison table with one row per candidate number of topics (k).
#
# - mean_model_precision: mean of the `rater_precision` entries of the oolong
#   summary. Taking the mean guarantees exactly one value per model, so the
#   column always lines up with `k`; with a single rater the value is unchanged.
# - mean_TLO: mean of the `tlo` entries of the oolong summary.
df_summary <- data.frame(
  k = c(80, 85, 90, 95, 100),
  mean_model_precision = c(
    mean(summary_80$rater_precision),
    mean(summary_85$rater_precision),
    mean(summary_90$rater_precision),
    mean(summary_95$rater_precision),
    mean(summary_100$rater_precision)
  ),
  mean_TLO = c(
    mean(summary_80$tlo),
    mean(summary_85$tlo),
    mean(summary_90$tlo),
    mean(summary_95$tlo),
    mean(summary_100$tlo)
  )
)

# The model with k = 85 is selected manually after inspecting df_summary.
final_stm <- stm_85

# Explanation: Comparatively high model precision and comparatively high mean TLO.

#---------------------------------------------------------------------------------------------------------------------


